import pandas as pd
import re
import os
from collections import defaultdict

# Run this file from inside the script directory: the working directory is
# changed below, and the relative 'Data/...' paths are resolved against the
# new working directory.

cwd = os.getcwd()
# Drop the last 7 characters of the current working directory.
# NOTE(review): presumably this strips the script folder's name to reach the
# project root -- confirm the folder-name length; a different folder name
# makes the chdir below target the wrong directory.
path = cwd[:-7]

os.chdir(path)

# Input sheet; later code reads its 'Docket Number', 'Term', 'Date',
# 'Action' and 'Text' columns.
data = pd.read_excel('Data/Processed_Data/Docket_Day.xlsx')

#Normalise docket numbers: unify the dash characters, then keep the first
#recognised docket pattern found in each cell

regular_docket = re.compile('[0-9]{2}–[0-9]{1,5}')
lettered_docket = re.compile(r'[0-9]{2}[AM]\s{0,1}[0-9]{1,5}')
original_docket = re.compile(r'[0-9]{1,3},{0,1}\s{1,2}Orig')
prefixed_docket = re.compile('[AMSD]{1}–[0-9]{1,5}')

dockets = []
errors = set()   # raw values that matched none of the patterns
texts = []       # initialised here for the text-cleaning step further down
for i, raw_docket in enumerate(data['Docket Number']):
    docket = str(raw_docket).replace('±','–').replace('– ','–').replace('-','–')

    # Patterns are tried in priority order; the first hit wins.
    found = regular_docket.search(docket)
    if found:
        dockets.append(found.group(0))
        continue

    found = lettered_docket.search(docket)
    if found:
        # Drop the optional space between the letter and the number.
        dockets.append(found.group(0).replace(' ',''))
        continue

    found = original_docket.search(docket)
    if found:
        # "<n> Orig" entries become "<two-character term>O<n>".
        number = re.search(r'\d+', found.group(0)).group(0)
        term = str(data['Term'][i])
        term = '0'+term if len(term)==1 else term
        dockets.append(term+'O'+number)
        continue

    found = prefixed_docket.search(docket)
    if found:
        dockets.append(found.group(0))
        continue

    # Nothing recognised: keep the normalised string and remember it.
    dockets.append(docket)
    errors.add(docket)
# Remove two kinds of residue from every text, then trim surrounding
# whitespace: a run starting at "JNL" (any case) up to the nearest "Y,"
# followed by 1-5 whitespace characters and up to four digits, and a run
# starting at "JNL" up to the nearest "MILES" plus up to four digits.
# NOTE(review): presumably these are page headers carried over from the
# source document -- confirm.
header_with_y = re.compile(r'[Jj][Nn][Ll].+?Y,\s{1,5}[0-9]{0,4}')
header_with_miles = re.compile('[Jj][Nn][Ll].+?MILES[0-9]{0,4}')

texts = [
    header_with_miles.sub('', header_with_y.sub('', raw_text)).strip()
    for raw_text in data['Text']
]

data['Cleaned Docket'] = dockets
data['Text'] = texts

#Fix spacing errors and typos in the text

# The four tables below are applied to every text in this exact order
# (literal -> regex -> literal -> regex); each rule sees the output of the
# rules before it, so the order must not be changed.

_LITERAL_FIXES_FIRST = [
    ('®', 'fi'),
    ('out–of–time', 'out of time'),
    ('ﬁ', 'fi'),
    # Every remaining en dash is removed.
    # NOTE(review): the original chain went on to replace '– ' (dash + space)
    # twice, which could never match once all dashes were gone, so those
    # calls were dropped here without changing the result. If the intent was
    # to re-join words hyphenated across a line break ("peti– tion"), the
    # dash+space rule would have to run BEFORE this one -- confirm.
    ('–', ''),
    ('habeascorpus', 'habeas corpus'),
    ('ti on', 'tion'),
    ('P eti', 'Peti'),
    ('certiorar i', 'certiorari'),
    ('paupe ris', 'pauperis'),
    ('pauper is', 'pauperis'),
    ('appea lability', 'appealability'),
    ('Judg ment', 'Judgment'),
    ('rehear ing', 'rehearing'),
    ('Disbar ment', 'Disbarment'),
    ('to fle', 'to file'),
    ('deny ing', 'denying'),
]

_REGEX_FIXES_FIRST = [(re.compile(pattern), replacement) for pattern, replacement in [
    (r'writ\s*of\s*certiorari\s*out\s*of\s*time', 'writ of certiorari out of time'),
    (r'The\s*Solicitor\s*General', 'The Solicitor General'),
    (r'in\s*forma\s*pauperis', 'in forma pauperis'),
    # NOTE(review): the next two patterns also consume the whitespace that
    # follows "certiorari", so the next word ends up glued to it unless a
    # later rule (e.g. the "... certiorari granted" ones) re-spaces it.
    (r'Petition\s*for\s*a*\s*writ\s*of\s*certiorari\s*', 'Petition for writ of certiorari'),
    (r'Petitions\s*for\s*writs\s*of\s*certiorari\s*', 'Petitions for writs of certiorari'),
    (r'views\s*of\s*the\s*United\s*States', 'views of the United States'),
    (r'brief\s*as\s*amicus\s*curiae', 'brief as amicus curiae'),
    (r'brief\s*as\s*amici\s*curiae', 'brief as amici curiae'),
]]

_LITERAL_FIXES_SECOND = [
    ('dividedargument', 'divided argument'),
    ('outofti me', 'out of time'),
    ('supplement al', 'supplemental'),
    ('injunc tion', 'injunction'),
    ('appoi ntment', 'appointment '),   # trailing space kept as in the original rule
    ('Specia l Master', 'Special Master'),
    ('appointmentof', 'appointment of'),
    ('Appl ication ', 'Application'),   # trailing space removed as in the original rule
    ('Genera l', 'General'),
]

_REGEX_FIXES_SECOND = [(re.compile(pattern), replacement) for pattern, replacement in [
    (r'Judgment\s*vacated,{0,1}\s*and\s*case\s*remanded', 'Judgment vacated and case remanded'),
    (r'The\s*judgment\s*is\s*vacated,{0,1}\s*and\s*the\s*case\s*is\s*remanded', 'The judgment is vacated and the case is remanded'),
    (r'The\s*judgment\s*is\s*vacated,{0,1}\s*and\s*the\s*cases\s*are\s*remanded', 'The judgment is vacated and the cases are remanded'),
    (r'Judgment\s*vacated,{0,1}\s*and\s*cases\s*remanded', 'Judgment vacated and cases remanded'),
    (r'Judgments\s*vacated,{0,1}\s*and\s*cases\s*remanded', 'Judgments vacated and cases remanded'),
    (r'Petition\s*for\s*writ\s*of\s*certiorari\s*granted', 'Petition for writ of certiorari granted'),
    (r'Petitions\s*for\s*writs\s*of\s*certiorari\s*granted', 'Petitions for writs of certiorari granted'),
    (r'petitions{0,1}\s*for\s*writs{0,1}\s*of\s*certiorari', 'petition for writ of certiorari'),
    (r'petition\s*for\s*a\s*writ\s*of\s*certiorari', 'petition for a writ of certiorari'),
    (r'i\s*n\s*f\s*o\s*r\s*m\s*a\s*p\s*a\s*u\s*p\s*e\s*r\s*i\s*s', 'in forma pauperis'),
    (r'\s*c\s*e\s*r\s*t\s*i\s*o\s*r\s*a\s*r\s*i\s*u\s*n\s*d\s*e\s*r\s*s\s*e\s*a\s*l',
     'certiorari under seal'),
]]

# A cleaned text ending in one of these continues into the next row.
_CONTINUATION_ENDINGS = (';', '; and', ';and')

# text2 receives exactly one entry per row of data['Text'].
text2 = []
j = 0   # rows seen since the last flush, i.e. rows still waiting for a text
for text in data['Text']:
    for old, new in _LITERAL_FIXES_FIRST:
        text = text.replace(old, new)
    for pattern, replacement in _REGEX_FIXES_FIRST:
        text = pattern.sub(replacement, text)
    for old, new in _LITERAL_FIXES_SECOND:
        text = text.replace(old, new)
    for pattern, replacement in _REGEX_FIXES_SECOND:
        text = pattern.sub(replacement, text)

    #Assign last action_text to previous dockets
    # Rows whose text ends with a continuation marker are held back; the
    # first following row that does not end with one supplies the text for
    # itself and for every held-back row. endswith() is used instead of
    # text[-1] so that an empty text does not raise IndexError.
    j += 1
    if not text.endswith(_CONTINUATION_ENDINGS):
        text2.extend([text] * j)
        j = 0

# If the sheet ends on rows that are still held back, give them the last
# cleaned text so text2 stays the same length as data (otherwise assigning
# it as a column would fail).
if j:
    text2.extend([text] * j)
    j = 0
# Regular expressions marking the start of a specific action inside an
# action text. They are applied in this order.
actions = [r'The\s{0,1}motion',r'The\s{0,1}ap{2,3}lication',r'Petitions{0,1}\s{0,1}for',r'Petitions{0,1}\s{0,1}of',
           'Motions{0,1}','The Solicitor General','Applications{0,1}',r'Petitions{0,1}\s{0,1}to',
           r'The\s{0,1}petitions{0,1}','Judgment vacated',r'[jJ]oint\s{0,1}motion',
           'The Acting Solicitor General','Certiorari Granted',r'The\s*judgment',r'Judgment\s*reversed',
           r'Judgment\s*affirmed', 'directed to','and petition','and the petition',r'[cC]ases\s*are\s*consolidated',
           'The order', 'Order entered']


def _split_keeping_separator(pieces, separator):
    """Split every string in *pieces* on *separator*, keeping the separator.

    For each piece, the text before the first occurrence is emitted as is
    (possibly empty) and every following fragment is emitted with
    *separator* re-attached at its front. Pieces that do not contain
    *separator* pass through unchanged. Order is preserved.
    """
    result = []
    for piece in pieces:
        head, *tails = piece.split(separator)
        result.append(head)
        result.extend(separator + tail for tail in tails)
    return result


#Split action_text into specific_actions
# docket_actions holds one (row position, list of specific actions) tuple per
# entry of text2.
docket_actions = []
for i, thing in enumerate(text2):
    parts = [thing]
    for action in actions:
        # The pattern is matched against the whole text; every distinct
        # matched string is then used as a literal separator on all pieces
        # produced so far. (The original multi-match branch overwrote
        # `parts` with the pieces of the last part only, dropping the
        # earlier ones.) dict.fromkeys de-duplicates in first-seen order so
        # the result does not depend on set iteration order.
        for separator in dict.fromkeys(re.findall(action, thing)):
            parts = _split_keeping_separator(parts, separator)
    # The leading piece (text before the first recognised action phrase, or
    # the whole text when nothing matched) is not an action and is dropped.
    docket_actions.append((i, parts[1:]))

data['Action_Text'] = text2

# Expand every input row into one output row per specific action found for
# it; rows without any specific action get a single row marked 'Missing'.
output = {
    'docket_number': [],
    'term': [],
    'date': [],
    'action': [],
    'text': [],
    'cleaned_docket': [],
    'action_text': [],
    'specific_action': [],
}
for i, (_, found_actions) in enumerate(docket_actions):
    # Normalise the action label: drop '&' and use the plural
    # 'Opinions Per Curiam' form.
    action_label = data['Action'][i].replace('&','').replace('Opinion Per Curiam','Opinions Per Curiam')
    if action_label == 'Per Curiam':
        action_label = 'Opinions Per Curiam'

    # Values shared by every output row generated from this input row.
    shared = {
        'docket_number': data['Docket Number'][i],
        'term': data['Term'][i],
        'date': data['Date'][i],
        'action': action_label,
        'text': data['Text'][i],
        'cleaned_docket': data['Cleaned Docket'][i],
        'action_text': data['Action_Text'][i],
    }
    for specific in found_actions or ['Missing']:
        for column, value in shared.items():
            output[column].append(value)
        output['specific_action'].append(specific)

df = pd.DataFrame(output)

df.to_excel('Data/Processed_Data/Docket_Day_Action.xlsx', index=False)
